Main analysis
# ----- SETUP -----
# Set rms data distribution so Predict()/summary() know variable ranges
dd <- datadist(mi_sum)
options(datadist = "dd")
# Seed for all analyses (use <- for assignment per R style)
SEED <- 1408
# Save outcome vector for reuse in c-statistic / calibration helpers
outcome_vector <- mi_sum$unsuccessful
Test for non-linearity
# Model formula with restricted cubic splines
# Continuous predictors (lab_hgb, age, bmi, educ_years) get 4-knot restricted
# cubic splines so nonlinearity can be tested with the anova() call below.
model_form_rcs <- as.formula(unsuccessful ~ hiv + diabetes_yn + rcs(lab_hgb,4) + evertb + xray_cavit + smear_pos + rcs(age,4) + female + rcs(bmi,4) + dishx_any_minus + rcs(educ_years,4) + alcoholhx + drughx + smokhx + non_white)
# Fit logistic model; x/y stored so validate()/calibrate() can be used later
rcs_model_si <- lrm(model_form_rcs, data=mi_sum, x=TRUE, y=TRUE)
# Partial-effect plots for each splined predictor
plot(Predict(rcs_model_si, lab_hgb))
plot(Predict(rcs_model_si, age))
plot(Predict(rcs_model_si, bmi))
plot(Predict(rcs_model_si, educ_years))
# Wald tests, including the "Nonlinear" rows testing the spline terms
anova(rcs_model_si)
Wald Statistics Response: unsuccessful
Factor Chi-Square d.f. P
hiv 10.10 1 0.0015
diabetes_yn 10.07 1 0.0015
lab_hgb 12.19 3 0.0068
Nonlinear 0.84 2 0.6578
evertb 0.78 1 0.3761
xray_cavit 0.00 1 0.9676
smear_pos 2.22 1 0.1366
age 8.63 3 0.0347
Nonlinear 6.86 2 0.0325
female 2.02 1 0.1551
bmi 2.68 3 0.4434
Nonlinear 0.19 2 0.9086
dishx_any_minus 0.47 1 0.4910
educ_years 7.15 3 0.0674
Nonlinear 1.72 2 0.4241
alcoholhx 0.29 2 0.8661
drughx 13.55 2 0.0011
smokhx 5.65 2 0.0593
non_white 1.95 1 0.1622
TOTAL NONLINEAR 9.29 8 0.3182
TOTAL 116.35 26 <.0001
Age needs non-linear term. Main analysis will use five age groups, but sensitivity analyses will use spline to compare results.
Full model
# Create age groups (five categories). fct_case_when evaluates conditions in
# order, so each later condition only sees the ages not already captured.
mi_sum <- mi_sum %>%
mutate(age_group = fct_case_when(
age < 25 ~ "<25",
age < 35 ~ "25-35",
age < 45 ~ "35-45",
age < 55 ~ "45-55",
age >= 55 ~ "55+"
))
# Model variables with outcome variable first
model_vars <- Cs(unsuccessful, hiv, diabetes_yn, lab_hgb, evertb, xray_cavit, smear_pos, age_group, female, bmi, dishx_any_minus, educ_years, alcoholhx, drughx, smokhx, non_white)
# Length vector for formulas later
length_model_vars <- length(model_vars)
# Full model formula: outcome ~ all predictors (reformulate avoids paste errors)
model_formula <- reformulate(model_vars[-1], response = "unsuccessful")
# Full model; x/y stored for validate()/calibrate()
full_model <- lrm(model_formula, data = mi_sum, x = TRUE, y = TRUE)
# Save coefficients
coef_full_model <- full_model$coefficients
# Linear predictor (log-odds scale) from full model
predvals_full_model <- predict(full_model)
# Bootstrap validation of full model (200 resamples)
full_val <- validate(full_model, B=200, seed=SEED)
Performance
Brier score, calibration slope, and calibration intercept of the full model
# Brier score, calibration slope/intercept (apparent and optimism-corrected)
perf_full_model <- save_val_results(full_val)
perf_full_model
apparent corrected
Brier score 0.137 0.146
Calibration slope 1.000 0.836
Calibration intercept 0.000 -0.177
CI for apparent c-statistic
# Apparent c-statistic with confidence interval
c_full_model <- c_ci(predvals_full_model, outcome_vector)
c_full_model
c_stat LB UB
[1,] 0.779 0.744 0.812
Internal validation
95% CI for the c-statistic internally validated:
# Bootstrap CI (2000 resamples) for apparent and optimism-corrected c-statistic
c_stat_ci(full_model, data=mi_sum, samples=2000)
2.5% 97.5%
orig 0.779 0.744 0.814
corrected 0.745 0.710 0.779
More validation measures
# Full validate() output (Dxy, R2, slope, intercept, Brier, etc.)
full_val
index.orig training test optimism index.corrected n
Dxy 0.5571 0.5879 0.5180 0.0699 0.4872 200
R2 0.2276 0.2592 0.1993 0.0598 0.1678 200
Intercept 0.0000 0.0000 -0.1773 0.1773 -0.1773 200
Slope 1.0000 1.0000 0.8364 0.1636 0.8364 200
Emax 0.0000 0.0000 0.0729 0.0729 0.0729 200
D 0.1550 0.1786 0.1342 0.0444 0.1106 200
U -0.0021 -0.0021 0.0045 -0.0066 0.0045 200
Q 0.1571 0.1807 0.1297 0.0510 0.1061 200
B 0.1367 0.1318 0.1407 -0.0089 0.1456 200
g 1.2213 1.3629 1.1311 0.2318 0.9896 200
gp 0.1713 0.1822 0.1605 0.0217 0.1496 200
Bootstrapped backwards selection
The model selection function below was adapted from code provided by Heinze et al. It first fits the full model, then fits one round of backward selection, and then uses 500 bootstrap samples (sampling with replacement) to estimate selected models and then takes the median coefficient value across those samples.
Variable selection adds uncertainty to regression coefficients, which is quantified by the root mean square difference (RMSD) ratios. Additionally, model-based uncertainty estimates from the selected model are falsely precise because they do not account for the backwards selection step.
RMSD ratio (RMSD of bootstrap estimates are divided by the standard error of that coefficient in the global model) quantify the uncertainty of variable selection, and expresses variance inflation or deflation caused by variable selection.
The relative conditional bias estimate quantifies how much variable-selection-induced bias one would have to expect if an IV is selected, and it increases with the uncertainty of selecting each predictor. It is computed as the difference of the mean of sampled regression coefficients computed from those samples where the variable was selected and the global model regression coefficient, divided by the global model regression coefficient. relative conditional bias <10% indicates negligible bias (evidenced by the bootstrap median as close to the global coefficient)
Using bootstrap medians and 95% are corrected for that uncertainty and can be interpreted as 95% CI obtained from sampling-based multi-model inference. The 2.5th and 97.5th percentiles can be interpreted as limits of 95% confidence intervals obtained by sampling-based multi-model inference.
# Set seed
set.seed(SEED)
# Bootstrapped backward selection (adapted from Heinze et al.): 500 resamples,
# pairwise p-value threshold 0.01; returns overview, frequency, and pairwise tables
main_results <- model_selection(model=model_formula, data=mi_sum, boot=500, return="all", pval_pair=0.01, seed=SEED)
Main results
main_results$overview
| Estimate | Standard error | Estimate | Standard error | RMSD ratio | Bootstrap inclusion frequency (%) | Relative conditional bias (%) | Median | 2.5th percentile | 97.5% percentile | |
|---|---|---|---|---|---|---|---|---|---|---|
| (Intercept) | 0.922 | 0.874 | 0.909 | 0.834 | 1.134 | 100.0 | -4.23 | 0.918 | -1.216 | 2.812 |
| lab_hgb | -0.171 | 0.050 | -0.171 | 0.049 | 1.184 | 98.4 | 6.22 | -0.178 | -0.292 | -0.074 |
| hiv | 0.779 | 0.240 | 0.780 | 0.230 | 1.119 | 97.0 | 3.43 | 0.780 | 0.000 | 1.263 |
| drughxFormer | 0.399 | 0.248 | 0.402 | 0.247 | 1.135 | 96.8 | 16.00 | 0.437 | -0.045 | 0.950 |
| drughxCurrent | 1.097 | 0.293 | 1.124 | 0.285 | 1.319 | 96.8 | 9.14 | 1.157 | 0.000 | 1.877 |
| diabetes_yn | 0.696 | 0.212 | 0.690 | 0.210 | 1.204 | 95.6 | 5.22 | 0.701 | 0.000 | 1.175 |
| age_group25-35 | -0.444 | 0.260 | -0.444 | 0.259 | 1.161 | 88.6 | 11.68 | -0.456 | -0.999 | 0.009 |
| age_group35-45 | -0.639 | 0.281 | -0.644 | 0.279 | 1.248 | 88.6 | 12.03 | -0.671 | -1.280 | 0.000 |
| age_group45-55 | -1.052 | 0.337 | -1.063 | 0.335 | 1.477 | 88.6 | 11.00 | -1.102 | -1.874 | 0.000 |
| age_group55+ | -0.357 | 0.337 | -0.433 | 0.329 | 1.214 | 88.6 | 16.33 | -0.353 | -1.224 | 0.342 |
| educ_years | -0.056 | 0.024 | -0.055 | 0.024 | 1.356 | 83.0 | 20.91 | -0.060 | -0.110 | 0.000 |
| smokhxFormer | 0.580 | 0.239 | 0.590 | 0.229 | 1.334 | 82.4 | 18.60 | 0.616 | 0.000 | 1.074 |
| smokhxCurrent | 0.462 | 0.266 | 0.484 | 0.257 | 1.241 | 82.4 | 20.42 | 0.492 | 0.000 | 1.136 |
| female | -0.329 | 0.215 | -0.356 | 0.213 | 1.256 | 60.8 | 51.13 | -0.368 | -0.780 | 0.000 |
| bmi | -0.045 | 0.030 | -0.043 | 0.029 | 1.340 | 57.4 | 48.67 | -0.048 | -0.109 | 0.000 |
| smear_pos | 0.364 | 0.252 | 0.370 | 0.247 | 1.252 | 53.0 | 53.87 | 0.379 | 0.000 | 0.841 |
| non_white | 0.351 | 0.261 | 0.392 | 0.257 | 1.282 | 52.0 | 73.55 | 0.390 | 0.000 | 0.991 |
| evertb | -0.240 | 0.259 | 0.000 | 0.000 | 1.139 | 34.4 | 131.84 | 0.000 | -0.833 | 0.000 |
| dishx_any_minus | -0.210 | 0.287 | 0.000 | 0.000 | 1.179 | 34.2 | 162.40 | 0.000 | -0.882 | 0.408 |
| alcoholhxFormer | 0.136 | 0.337 | 0.000 | 0.000 | 0.903 | 22.4 | 269.12 | 0.000 | -0.248 | 0.931 |
| alcoholhxCurrent | 0.175 | 0.338 | 0.000 | 0.000 | 0.945 | 22.4 | 249.93 | 0.000 | 0.000 | 1.018 |
| xray_cavit | -0.020 | 0.193 | 0.000 | 0.000 | 0.866 | 18.0 | -348.43 | 0.000 | -0.385 | 0.414 |
Frequency table
The table below shows how often each model was selected during the bootstrap sampling. The boot model (including variables selected in at least 50% of bootstrap samples) was selected <2% of the time, tied for the most frequently selected model — illustrating the variability of the resampling procedure.
main_results$freq_table
| Predictors | count | percent | cum_percent | |
|---|---|---|---|---|
| 51 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female smear_pos non_white | 9 | 1.8 | 1.8 |
| 60 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi smear_pos non_white | 9 | 1.8 | 3.6 |
| 16 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female smear_pos | 8 | 1.6 | 5.2 |
| 33 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female non_white | 8 | 1.6 | 6.8 |
| 44 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi non_white | 8 | 1.6 | 8.4 |
| 5 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female | 7 | 1.4 | 9.8 |
| 25 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi smear_pos | 6 | 1.2 | 11.0 |
| 107 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi smear_pos non_white evertb | 6 | 1.2 | 12.2 |
| 125 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female smear_pos dishx_any_minus | 6 | 1.2 | 13.4 |
| 138 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi non_white dishx_any_minus | 6 | 1.2 | 14.6 |
| 7 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent bmi | 5 | 1.0 | 15.6 |
| 12 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi | 5 | 1.0 | 16.6 |
| 20 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent bmi smear_pos | 4 | 0.8 | 17.4 |
| 39 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent bmi non_white | 4 | 0.8 | 18.2 |
| 47 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent smear_pos non_white | 4 | 0.8 | 19.0 |
| 77 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent bmi smear_pos evertb | 4 | 0.8 | 19.8 |
| 89 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent bmi non_white evertb | 4 | 0.8 | 20.6 |
| 106 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ smokhxFormer smokhxCurrent female bmi smear_pos non_white evertb | 4 | 0.8 | 21.4 |
| 173 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi smear_pos non_white evertb dishx_any_minus | 4 | 0.8 | 22.2 |
| 224 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi dishx_any_minus alcoholhxFormer alcoholhxCurrent | 4 | 0.8 | 23.0 |
Pairwise table
The table below informs us about the variables that are often included together (“rope teams”) or variables that may be substituted for one another (“competitors”).
- A “>” sign indicates that variables are selected together more often than would be expected by chance by multiplying their individual selection probabilities
- A “<” sign indicates that the variables were selected together less often than would be expected by chance.
main_results$pair_table
| lab_hgb | hiv | drughxFormer | drughxCurrent | diabetes_yn | age_group25-35 | age_group35-45 | age_group45-55 | age_group55+ | educ_years | smokhxFormer | smokhxCurrent | female | bmi | smear_pos | non_white | evertb | dishx_any_minus | alcoholhxFormer | alcoholhxCurrent | xray_cavit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| lab_hgb | 98.4 | 95.4 | 95.2 | 95.2 | 94 | 87.4 | 87.4 | 87.4 | 87.4 | 81.6 | 81 | 81 | 60.4 | 55.8 | 52.4 | 50.8 | 33.6 | 33.4 | 22.2 | 22.2 | 18 |
| hiv | 97 | 93.8 | 93.8 | 92.6 | 85.6 | 85.6 | 85.6 | 85.6 | 80.6 | 80.2 | 80.2 | 58.2 | 56 | 51.8 | 50.8 | 33.4 | 33.6 | 21.4 | 21.4 | 17.6 | |
| drughxFormer | 96.8 | 96.8 | 92.4 | 85.4 | 85.4 | 85.4 | 85.4 | 79.8 | 79.2 | 79.2 | 58.2 | 55.8 | 51 | 50.8 | 32.8 | 32.8 | 21.4 | 21.4 | 17.6 | ||
| drughxCurrent | > | 96.8 | 92.4 | 85.4 | 85.4 | 85.4 | 85.4 | 79.8 | 79.2 | 79.2 | 58.2 | 55.8 | 51 | 50.8 | 32.8 | 32.8 | 21.4 | 21.4 | 17.6 | ||
| diabetes_yn | 95.6 | 84.8 | 84.8 | 84.8 | 84.8 | 79.4 | 79 | 79 | 57.6 | 56 | 50.6 | 49.4 | 32.6 | 33.2 | 21.8 | 21.8 | 17.6 | ||||
| age_group25-35 | 88.6 | 88.6 | 88.6 | 88.6 | 75.4 | 74.2 | 74.2 | 54.8 | 48.2 | 47.6 | 46.6 | 30 | 32 | 19.6 | 19.6 | 15 | |||||
| age_group35-45 | > | 88.6 | 88.6 | 88.6 | 75.4 | 74.2 | 74.2 | 54.8 | 48.2 | 47.6 | 46.6 | 30 | 32 | 19.6 | 19.6 | 15 | |||||
| age_group45-55 | > | > | 88.6 | 88.6 | 75.4 | 74.2 | 74.2 | 54.8 | 48.2 | 47.6 | 46.6 | 30 | 32 | 19.6 | 19.6 | 15 | |||||
| age_group55+ | > | > | > | 88.6 | 75.4 | 74.2 | 74.2 | 54.8 | 48.2 | 47.6 | 46.6 | 30 | 32 | 19.6 | 19.6 | 15 | |||||
| educ_years | > | < | < | < | 83 | 67.6 | 67.6 | 49.8 | 46 | 44.4 | 42.4 | 29.2 | 28.4 | 18.6 | 18.6 | 15 | |||||
| smokhxFormer | 82.4 | 82.4 | 48.8 | 48.4 | 43.6 | 41.6 | 28.4 | 28.2 | 16.8 | 16.8 | 14.4 | ||||||||||
| smokhxCurrent | > | 82.4 | 48.8 | 48.4 | 43.6 | 41.6 | 28.4 | 28.2 | 16.8 | 16.8 | 14.4 | ||||||||||
| female | 60.8 | 33.2 | 35 | 31.6 | 18.2 | 20.2 | 10.4 | 10.4 | 11.4 | ||||||||||||
| bmi | > | < | < | < | 57.4 | 29 | 30.4 | 20.4 | 19.2 | 12 | 12 | 10 | |||||||||
| smear_pos | 53 | 28.4 | 18.4 | 17.8 | 10.2 | 10.2 | 10.6 | ||||||||||||||
| non_white | 52 | 18.2 | 14 | 10.6 | 10.6 | 8.2 | |||||||||||||||
| evertb | 34.4 | 12.4 | 8.2 | 8.2 | 6.4 | ||||||||||||||||
| dishx_any_minus | < | 34.2 | 7.4 | 7.4 | 6.4 | ||||||||||||||||
| alcoholhxFormer | < | 22.4 | 22.4 | 4.6 | |||||||||||||||||
| alcoholhxCurrent | < | < | 22.4 | 4.6 | |||||||||||||||||
| xray_cavit | 18 |
# Raw overview table as a dataframe (row names become a "variable" column)
raw_table <- as.data.frame(main_results$raw_overview) %>% rownames_to_column("variable")
# Save coefficients from selected model and boot medians
coef_selected_model <- main_results$raw_overview[,"sel_est"]
coef_boot_median <- main_results$raw_overview[,"boot_median"]
Bootstrap selected model (“boot” model)
50% threshold
# Set inclusion threshold (% of bootstrap samples a variable must appear in)
threshold <- 50
# Extract names of variables retained in boot model based on threshold above
boot_vars_50 <- raw_table %>% filter(boot_inclusion >= threshold) %>% pull(variable)
# Strip factor-level suffixes to recover the underlying variable names.
# Anchor at end-of-string and escape "+": the previous pattern's "[55+]" was a
# character class matching any "5" or "+" anywhere in a name.
boot_vars_50 <- gsub("(Former|Never|Current|25-35|35-45|45-55|55\\+)$", "", boot_vars_50)
boot_vars_50 <- unique(boot_vars_50)
# Drop the intercept row
boot_vars_50 <- boot_vars_50[-1]
boot_vars_50
[1] "lab_hgb" "hiv" "drughx" "diabetes_yn" "age_group"
[6] "educ_years" "smokhx" "female" "bmi" "smear_pos"
[11] "non_white"
# Boot model formula
boot_form_50 <- reformulate(boot_vars_50, response = "unsuccessful")
# Boot model; x/y stored for validate()/calibrate()
boot_model_50 <- lrm(boot_form_50, data = mi_sum, x = TRUE, y = TRUE)
# Boot model validation (200 bootstrap resamples)
boot_val_50 <- validate(boot_model_50, B=200, seed=SEED)
# Coefficients from the model refit including only variables from the 50% or more bootstraps
coef_boot_vars_50 <- boot_model_50$coefficients
# C statistic (apparent and internally validated)
c_stat_ci(boot_model_50, data=mi_sum, samples=2000)
2.5% 97.5%
orig 0.778 0.740 0.812
corrected 0.753 0.719 0.788
# Slope and intercept (plus Brier score), apparent and optimism-corrected
save_val_results(boot_val_50)
apparent corrected
Brier score 0.137 0.143
Calibration slope 1.000 0.875
Calibration intercept 0.000 -0.142
# Calibration plot (bias-corrected via 200 bootstrap resamples);
# fixed axis-label typo "Proability" -> "Probability"
plot(calibrate(boot_model_50, B=200), xlab="Predicted Probability of Unsuccessful Outcome", ylab="Actual Probability of Unsuccessful Outcome")
n=944 Mean absolute error=0.023 Mean squared error=0.00139
0.9 Quantile of absolute error=0.034
60% threshold
# Set inclusion threshold (% of bootstrap samples a variable must appear in)
threshold <- 60
# Extract names of variables retained in boot model based on threshold above
boot_vars_60 <- raw_table %>% filter(boot_inclusion >= threshold) %>% pull(variable)
# Strip factor-level suffixes; anchored pattern with escaped "+" (the previous
# "[55+]" was a character class matching any "5" or "+")
boot_vars_60 <- gsub("(Former|Never|Current|25-35|35-45|45-55|55\\+)$", "", boot_vars_60)
boot_vars_60 <- unique(boot_vars_60)
# Drop the intercept row
boot_vars_60 <- boot_vars_60[-1]
boot_vars_60
[1] "lab_hgb" "hiv" "drughx" "diabetes_yn" "age_group"
[6] "educ_years" "smokhx" "female"
# Boot model formula
boot_form_60 <- reformulate(boot_vars_60, response = "unsuccessful")
# Boot model; x/y stored for validate()/calibrate()
boot_model_60 <- lrm(boot_form_60, data = mi_sum, x = TRUE, y = TRUE)
# Boot model validation (200 bootstrap resamples)
boot_val_60 <- validate(boot_model_60, B=200, seed=SEED)
# Coefficients from the model refit including only variables from the 60% or more bootstraps
coef_boot_vars_60 <- boot_model_60$coefficients
# C statistic (apparent and internally validated)
c_stat_ci(boot_model_60, data=mi_sum, samples=2000)
2.5% 97.5%
orig 0.773 0.739 0.807
corrected 0.752 0.718 0.789
# Slope and intercept (plus Brier score), apparent and optimism-corrected
save_val_results(boot_val_60)
apparent corrected
Brier score 0.139 0.145
Calibration slope 1.000 0.897
Calibration intercept 0.000 -0.113
# Calibration plot (bias-corrected via 200 bootstrap resamples);
# fixed axis-label typo "Proability" -> "Probability"
plot(calibrate(boot_model_60, B=200), xlab="Predicted Probability of Unsuccessful Outcome", ylab="Actual Probability of Unsuccessful Outcome")
n=944 Mean absolute error=0.029 Mean squared error=0.00182
0.9 Quantile of absolute error=0.036
70% threshold
# Set inclusion threshold (% of bootstrap samples a variable must appear in)
threshold <- 70
# Extract names of variables retained in boot model based on threshold above
boot_vars_70 <- raw_table %>% filter(boot_inclusion >= threshold) %>% pull(variable)
# Strip factor-level suffixes; anchored pattern with escaped "+" (the previous
# "[55+]" was a character class matching any "5" or "+")
boot_vars_70 <- gsub("(Former|Never|Current|25-35|35-45|45-55|55\\+)$", "", boot_vars_70)
boot_vars_70 <- unique(boot_vars_70)
# Drop the intercept row
boot_vars_70 <- boot_vars_70[-1]
boot_vars_70
[1] "lab_hgb" "hiv" "drughx" "diabetes_yn" "age_group"
[6] "educ_years" "smokhx"
# Boot model formula
boot_form_70 <- reformulate(boot_vars_70, response = "unsuccessful")
# Boot model; x/y stored for validate()/calibrate()
boot_model_70 <- lrm(boot_form_70, data = mi_sum, x = TRUE, y = TRUE)
# Boot model validation (200 bootstrap resamples)
boot_val_70 <- validate(boot_model_70, B=200, seed=SEED)
# Coefficients from the model refit including only variables from the 70% or more bootstraps
coef_boot_vars_70 <- boot_model_70$coefficients
# C statistic (apparent and internally validated)
c_stat_ci(boot_model_70, data=mi_sum, samples=2000)
2.5% 97.5%
orig 0.768 0.733 0.802
corrected 0.748 0.714 0.784
# Performance measures: Brier score, calibration slope and intercept
save_val_results(boot_val_70)
apparent corrected
Brier score 0.139 0.145
Calibration slope 1.000 0.896
Calibration intercept 0.000 -0.118
# Calibration plot (bias-corrected via 200 bootstrap resamples);
# fixed axis-label typo "Proability" -> "Probability"
plot(calibrate(boot_model_70, B=200), xlab="Predicted Probability of Unsuccessful Outcome", ylab="Actual Probability of Unsuccessful Outcome")
n=944 Mean absolute error=0.027 Mean squared error=0.00149
0.9 Quantile of absolute error=0.032
An 80% threshold would select the same variables as the 70% threshold.
90% threshold
# Set inclusion threshold (% of bootstrap samples a variable must appear in)
threshold <- 90
# Extract names of variables retained in boot model based on threshold above
boot_vars_90 <- raw_table %>% filter(boot_inclusion >= threshold) %>% pull(variable)
# Strip factor-level suffixes; anchored pattern with escaped "+" (the previous
# "[55+]" was a character class matching any "5" or "+")
boot_vars_90 <- gsub("(Former|Never|Current|25-35|35-45|45-55|55\\+)$", "", boot_vars_90)
boot_vars_90 <- unique(boot_vars_90)
# Drop the intercept row
boot_vars_90 <- boot_vars_90[-1]
# Boot model formula
boot_form_90 <- reformulate(boot_vars_90, response = "unsuccessful")
# Boot model; x/y stored for validate()/calibrate()
boot_model_90 <- lrm(boot_form_90, data = mi_sum, x = TRUE, y = TRUE)
# Boot model validation (200 bootstrap resamples)
boot_val_90 <- validate(boot_model_90, B=200, seed=SEED)
# Coefficients from the model refit including only variables from the 90% or more bootstraps
coef_boot_vars_90 <- boot_model_90$coefficients
# C statistic (apparent and internally validated)
c_stat_ci(boot_model_90, data=mi_sum, samples=2000)
2.5% 97.5%
orig 0.739 0.698 0.775
corrected 0.731 0.694 0.769
# Performance measures: Brier score, calibration slope and intercept
save_val_results(boot_val_90)
apparent corrected
Brier score 0.143 0.145
Calibration slope 1.000 0.962
Calibration intercept 0.000 -0.049
# Calibration plot (bias-corrected via 200 bootstrap resamples);
# fixed axis-label typo "Proability" -> "Probability"
plot(calibrate(boot_model_90, B=200), xlab="Predicted Probability of Unsuccessful Outcome", ylab="Actual Probability of Unsuccessful Outcome")
n=944 Mean absolute error=0.019 Mean squared error=0.00089
0.9 Quantile of absolute error=0.044
Decision
Based on the comparisons above, and based on the literature, I believe a 70% threshold is best trade-off between parsimony and performance.
# Set general terms based on threshold
# (70% chosen above; these aliases keep downstream code threshold-agnostic)
boot_vars <- boot_vars_70
boot_model <- boot_model_70
boot_val <- boot_val_70
coef_boot_vars <- coef_boot_vars_70
# Linear predictor, apparent c-statistic CI, and performance summary
predvals_boot <- predict(boot_model)
c_boot_model <- c_ci(predvals_boot, outcome_vector)
perf_boot_model <- save_val_results(boot_val_70)
# Coefficients and standard errors
boot_est <- coef(boot_model)
# Variances are the diagonal of the model covariance matrix
boot_var <- diag(boot_model$var)
boot_se <- sqrt(boot_var)
boot_coef_se <- as.data.frame(cbind(boot_est, boot_se)) %>% rownames_to_column("variable")
I also wrote code that constructs a calibration plot but does not include the bias-corrected line.
# My functions for calibration plot (apparent calibration only; no bias-corrected line)
cal_plot_solo(mi_sum, predvals_boot, outcome_vector)
All validation measures
boot_val
index.orig training test optimism index.corrected n
Dxy 0.5367 0.5553 0.5132 0.0421 0.4946 200
R2 0.2097 0.2293 0.1933 0.0360 0.1738 200
Intercept 0.0000 0.0000 -0.1181 0.1181 -0.1181 200
Slope 1.0000 1.0000 0.8964 0.1036 0.8964 200
Emax 0.0000 0.0000 0.0459 0.0459 0.0459 200
D 0.1418 0.1565 0.1299 0.0266 0.1152 200
U -0.0021 -0.0021 0.0019 -0.0041 0.0019 200
Q 0.1439 0.1586 0.1279 0.0306 0.1133 200
B 0.1392 0.1363 0.1418 -0.0055 0.1447 200
g 1.1243 1.1993 1.0701 0.1292 0.9951 200
gp 0.1639 0.1711 0.1575 0.0135 0.1504 200
ROC curve
# Predicted risk: inverse-logit of the linear predictor (plogis is the
# built-in logistic CDF, equivalent to 1/(1+exp(-x)))
predrisk_boot <- plogis(predvals_boot)
mi_sum$predrisk_boot <- predrisk_boot
# ROC curve with DeLong CI (pROC)
roc <- roc(mi_sum, unsuccessful, predrisk_boot, ci=TRUE)
roc
roc
Call:
roc.data.frame(data = mi_sum, response = unsuccessful, predictor = predrisk_boot, ci = TRUE)
Data: predrisk_boot in 753 controls (unsuccessful 0) < 191 cases (unsuccessful 1).
Area under the curve: 0.768
95% CI: 0.733-0.803 (DeLong)
# Output plot
plot(roc)
# Overlay CI band at each threshold, then redraw the curve on top
plot(ci.thresholds(roc), type="shape", col="lightblue")
plot(roc, add=TRUE)
# Export to jpeg
jpeg(here("images", "roc_curve_boot.jpeg"), width=300, height=300)
plot(roc)
plot(ci.thresholds(roc), type="shape", col="lightblue")
plot(roc, add=TRUE)
dev.off()
quartz_off_screen
2
# Save calibration plot; fixed axis-label typo "Proability" -> "Probability"
jpeg(here("images", "cal_plot_boot.jpeg"), width=450, height=450)
plot(calibrate(boot_model, B=200), xlab="Predicted Probability of Unsuccessful Outcome", ylab="Actual Probability of Unsuccessful Outcome")
n=944 Mean absolute error=0.028 Mean squared error=0.00149
0.9 Quantile of absolute error=0.035
dev.off()
quartz_off_screen
2
# Alternative calibration plot (custom function, apparent calibration only)
jpeg(here("images", "cal_plot_boot_v2.jpeg"), width=450, height=450)
cal_plot_solo(mi_sum, predvals_boot, outcome_vector)
dev.off()
quartz_off_screen
2
Hosmer-Lemeshow test
# Hosmer-Lemeshow goodness-of-fit test with 20 groups
hoslem.test(outcome_vector, predrisk_boot, g=20)
Hosmer and Lemeshow goodness of fit (GOF) test
data: outcome_vector, predrisk_boot
X-squared = 21, df = 18, p-value = 0.3
Shrinkage from boot model
Below I use the variables from the boot model and model fit to estimate the heuristic shrinkage factor.
A heuristic shrinkage factor is suggested to improve predictions in new patients, and can be estimated as: \[(\chi^2_{model} - df) / \chi^2_{model}\]
# Heuristic shrinkage factor: (model LR chi-square - df) / model LR chi-square
shrinkage_factor <- (boot_model$stats["Model L.R."] - boot_model$stats["d.f."]) / (boot_model$stats["Model L.R."])
shrinkage_factor
Model L.R.
0.911
# Adjust coefficients from boot variable model by shrinkage factor
# NOTE(review): this shrinks the intercept along with the slopes; re-estimating
# the intercept after shrinkage is often recommended -- confirm intent.
coef_shrink_model <- coef_boot_vars*shrinkage_factor
# Shrunken linear predictor and predicted risk (inverse-logit via plogis)
predvals_shrink <- predict(boot_model)*shrinkage_factor
predrisk_shrink <- plogis(predvals_shrink)
mi_sum$predrisk_shrink <- predrisk_shrink
mi_sum$predvals_shrink <- predvals_shrink
Performance
Brier score, calibration slope, and calibration intercept of the shrunken model:
# Performance of the shrunken linear predictor
shrink_perf <- ev_glm(method="original", lp="predvals_shrink", outcome="unsuccessful", data=mi_sum, samples=200, return="performance")
# BUG FIX: rows were selected as Cs(Brier, Intercept, Slope) but then labeled
# Brier/slope/intercept, swapping the two calibration rows. Select in the same
# order as the labels.
perf_shrink <- shrink_perf[Cs(Brier, Slope, Intercept), Cs(apparent, corrected)]
rownames(perf_shrink) <- c("Brier score", "Calibration slope", "Calibration intercept")
perf_shrink
apparent corrected
Brier score 0.139 0.1403
Calibration slope 0.000 -0.0127
Calibration intercept 1.098 1.0746
CI for apparent c-statistic
# Apparent c-statistic with CI for the shrunken linear predictor
c_shrink <- c_ci(predvals_shrink, outcome_vector)
c_shrink
c_stat LB UB
[1,] 0.768 0.731 0.803
Calibration plot
cal_plot_solo(mi_sum, predvals_shrink, outcome_vector)
ROC curve
plot(ev_glm(method="original", lp="predvals_shrink", outcome="unsuccessful", data=mi_sum, samples=1, return="roc"))
plot(roc(outcome_vector, predvals_shrink))
This is similar to the ROC curve from the non-shrunk values (coefficients from boot vars model)
Bootstrap medians
This section uses the median coefficients estimated from the bootstrap sampling to evaluate model performance.
# Select non-zero values
boot_medians <- raw_table %>%
filter(boot_median!=0) %>%
select(variable, boot_median)
# Apply formula
# NOTE(review): these coefficients are hand-typed from main_results$overview.
# If the selection run changes (seed, boot count), they silently go stale --
# consider computing the linear predictor from boot_medians instead.
mi_sum <- mi_sum %>%
mutate(predvals_boot_median =
0.9182 +
-0.1783*lab_hgb +
0.7803*hiv +
0.4369*(drughx=="Former") +
1.157*(drughx=="Current") +
0.7006*diabetes_yn +
-0.4557*(age_group=="25-35") +
-0.6709*(age_group=="35-45") +
-1.1024*(age_group=="45-55") +
-0.3533*(age_group=="55+") +
-0.0598*educ_years +
0.6162*(smokhx=="Former") +
0.4924*(smokhx=="Current") +
-0.0477*bmi +
-0.3680*female +
0.3792*smear_pos +
0.3895*non_white,
# Predicted risk = inverse logit of the linear predictor
predrisk_boot_median = 1/(1+exp(-predvals_boot_median))
)
# Summarize predictions by observed outcome
mi_sum %>%
select(predvals_boot_median, predrisk_boot_median, unsuccessful) %>%
tbl_summary(by="unsuccessful")
| Characteristic | 0, N = 7531 | 1, N = 1911 |
|---|---|---|
| predvals_boot_median | -2.18 (-2.78, -1.40) | -1.04 (-1.68, -0.44) |
| predrisk_boot_median | 0.10 (0.06, 0.20) | 0.26 (0.16, 0.39) |
|
1
Statistics presented: median (IQR)
|
||
Performance
# Performance of the bootstrap-median linear predictor
boot_med_perf <- ev_glm(method="original", lp="predvals_boot_median", outcome="unsuccessful", data=mi_sum, samples=200, return="performance")
# BUG FIX: row selection order must match the labels below (previously
# Cs(Brier, Intercept, Slope), which swapped the two calibration rows)
perf_boot_median <- boot_med_perf[Cs(Brier, Slope, Intercept), Cs(apparent, corrected)]
rownames(perf_boot_median) <- c("Brier score", "Calibration slope", "Calibration intercept")
perf_boot_median
apparent corrected
Brier score 0.138 0.138
Calibration slope 0.123 0.140
Calibration intercept 0.966 0.972
CI for apparent c-statistic
# Pull the linear predictor back out as a plain vector (use <- for assignment)
predvals_boot_median <- mi_sum$predvals_boot_median
# Apparent c-statistic with CI
c_boot_median <- c_ci(predvals_boot_median, outcome_vector)
c_boot_median
c_stat LB UB
[1,] 0.777 0.74 0.811
Calibration plot
cal_plot_solo(mi_sum, predvals_boot_median, outcome_vector)
ROC curve
plot(ev_glm(method="original", lp="predvals_boot_median", outcome="unsuccessful", data=mi_sum, samples=1, return="roc"))
LASSO
LASSO with glmnet - lambda minimum and 1se were estimated using 10-fold cross validation.
# Matrix of x variable values (drop the intercept column)
x_vars <- model.matrix(model_formula, mi_sum)[,-1]
# Fit lasso path. glmnet()/cv.glmnet() have no 'seed' argument -- the only
# randomness is cv.glmnet's fold assignment, so seed the RNG just before it.
lasso_fit <- glmnet(x_vars, outcome_vector, family="binomial", alpha=1)
# Cross validation to select optimal lambda (10-fold, AUC criterion)
set.seed(SEED)
cv_lasso <- cv.glmnet(x_vars, outcome_vector, family = "binomial", type.measure="auc", nfolds=10)
plot(cv_lasso)
plot(cv_lasso)
Lambda min
Using lambda min, almost all variables from the full model were included:
# Coefficients at lambda.min
coef_lasso_min <- coef(cv_lasso, s="lambda.min")
# Predicted risk and values (predict() is deterministic; no seed argument needed)
predrisk_lasso_min <- cv_lasso %>% predict(newx = x_vars, s="lambda.min", type="response")
predrisk_lasso_min_vec <- as.vector(predrisk_lasso_min)
mi_sum$predrisk_lasso_min <- predrisk_lasso_min_vec
predvals_lasso_min <- cv_lasso %>% predict(newx = x_vars, s="lambda.min")
predvals_lasso_min_vec <- as.vector(predvals_lasso_min)
mi_sum$predvals_lasso_min <- predvals_lasso_min_vec
# Predicted outcomes lasso min (0.5 risk cutoff)
predout_lasso_min <- ifelse(predrisk_lasso_min > 0.5, 1, 0)
Performance
The lambda min model had an accuracy (defined as the concordance between predicted outcome - predicted risk >0.5 = outcome, predicted risk <=0.5 = no outcome vs. the true values of the outcome) of:
mean(predout_lasso_min == outcome_vector)
[1] 0.798
Brier score, calibration slope, and calibration intercept of the model:
# Performance of the lambda.min lasso linear predictor
lasso_min_perf <- ev_glm(method="original", lp="predvals_lasso_min", outcome="unsuccessful", data=mi_sum, samples=200, return="performance")
# BUG FIX: row selection order must match the labels below (previously
# Cs(Brier, Intercept, Slope), which swapped the two calibration rows --
# the val.prob output further down confirms Intercept=0.0535, Slope=1.046)
perf_lasso_min <- lasso_min_perf[Cs(Brier, Slope, Intercept), Cs(apparent, corrected)]
rownames(perf_lasso_min) <- c("Brier score", "Calibration slope", "Calibration intercept")
perf_lasso_min
apparent corrected
Brier score 0.137 0.1380
Calibration slope 0.053 0.0418
Calibration intercept 1.046 1.0293
CI for apparent c-statistic
# Apparent c-statistic with CI for the lambda.min model
c_lasso_min <- c_ci(predvals_lasso_min_vec, outcome_vector)
c_lasso_min
c_stat LB UB
[1,] 0.778 0.74 0.812
Calibration plot
And here is the calibration curve and c-statistic:
# rms calibration curve plus summary statistics (Dxy, c, Brier, slope, intercept)
val.prob(predrisk_lasso_min, outcome_vector, cex=.5)
Dxy C (ROC) R2 D D:Chi-sq D:p U U:Chi-sq
0.55584 0.77792 0.22682 0.15440 146.75506 NA -0.00188 0.22937
U:p Q Brier Intercept Slope Emax E90 Eavg
0.89165 0.15628 0.13655 0.05350 1.04621 0.15450 0.04287 0.02198
S:z S:p
0.00986 0.99213
ROC curve
plot(ev_glm(method="original", lp="predvals_lasso_min", outcome="unsuccessful", data=mi_sum, samples=1, return="roc"))
Lambda 1se
# Coefficients at lambda.1se
coef_lasso_1se <- coef(cv_lasso, s="lambda.1se")
# Predicted values and risks (predict() is deterministic; no seed argument needed)
predrisk_lasso_1se <- cv_lasso %>% predict(newx = x_vars, s="lambda.1se", type="response")
predrisk_lasso_1se_vec <- as.vector(predrisk_lasso_1se)
mi_sum$predrisk_lasso_1se <- predrisk_lasso_1se_vec
predvals_lasso_1se <- cv_lasso %>% predict(newx = x_vars, s="lambda.1se")
predvals_lasso_1se_vec <- as.vector(predvals_lasso_1se)
mi_sum$predvals_lasso_1se <- predvals_lasso_1se_vec
# Predicted outcome (0.5 risk cutoff)
predout_lasso_1se <- ifelse(predrisk_lasso_1se > 0.5, 1, 0)
Performance
The lambda 1se model had an accuracy (defined as the concordance between predicted outcome - predicted risk >0.5 = outcome, predicted risk <=0.5 = no outcome vs. the true values of the outcome) of:
mean(predout_lasso_1se == outcome_vector)
[1] 0.801
Brier score, calibration slope, and calibration intercept of the lambda 1se model:
# Performance of the lambda.1se lasso linear predictor
lasso_1se_perf <- ev_glm(method="original", lp="predvals_lasso_1se", outcome="unsuccessful", data=mi_sum, samples=200, return="performance")
# BUG FIX: row selection order must match the labels below (previously
# Cs(Brier, Intercept, Slope), which swapped the two calibration rows --
# the val.prob output further down confirms Intercept=0.747, Slope=1.597)
perf_lasso_1se <- lasso_1se_perf[Cs(Brier, Slope, Intercept), Cs(apparent, corrected)]
rownames(perf_lasso_1se) <- c("Brier score", "Calibration slope", "Calibration intercept")
perf_lasso_1se
apparent corrected
Brier score 0.142 0.141
Calibration slope 0.747 0.725
Calibration intercept 1.597 1.585
CI for apparent c-statistic
# Apparent c-statistic with CI for the lambda.1se model
c_lasso_1se <- c_ci(predvals_lasso_1se_vec, outcome_vector)
c_lasso_1se
c_stat LB UB
[1,] 0.758 0.722 0.794
Calibration plot
And here is the calibration curve and c-statistic:
val.prob(predrisk_lasso_1se, outcome_vector, cex=.5)
Dxy C (ROC) R2 D D:Chi-sq D:p U U:Chi-sq
0.51566 0.75783 0.17211 0.11463 109.21028 NA 0.01500 16.15711
U:p Q Brier Intercept Slope Emax E90 Eavg
0.00031 0.09963 0.14195 0.74673 1.59709 0.13211 0.08773 0.04899
S:z S:p
-1.31431 0.18874
ROC curve
# Apparent ROC curve for the lambda.1se model (samples=1: no resampling)
plot(ev_glm(method="original", lp="predvals_lasso_1se", outcome="unsuccessful", data=mi_sum, samples=1, return="roc"))
Model approximation
The output below shows the stepwise removal of variables from the full model. This is the first step of model approximation: regress all candidate variables on the predicted values from the full model, so we can estimate how well each subset of variables reproduces the full model's predictions.
# Regress the full model's predicted values on every candidate covariate,
# then use fastbw with a huge AIC threshold to force deletion of all terms,
# showing the order of removal and the R2 retained at each step.
rhs_terms <- model_vars[2:length_model_vars]
approx_full_formula <- reformulate(rhs_terms, response = "predvals_full_model")
approx_full_model <- ols(approx_full_formula, sigma=1, data=mi_sum)
fastbw(approx_full_model, aics=10000)
Deleted Chi-Sq d.f. P Residual d.f. P AIC R2
xray_cavit 0.08 1 0.7770 0.08 1 0.7770 -1.92 1.000
alcoholhx 2.86 2 0.2388 2.94 3 0.4003 -3.06 0.997
dishx_any_minus 5.37 1 0.0204 8.32 4 0.0806 0.32 0.993
evertb 7.09 1 0.0077 15.41 5 0.0087 5.41 0.986
bmi 18.41 1 0.0000 33.82 6 0.0000 21.82 0.970
smear_pos 18.50 1 0.0000 52.32 7 0.0000 38.32 0.953
non_white 26.00 1 0.0000 78.31 8 0.0000 62.31 0.930
female 30.30 1 0.0000 108.61 9 0.0000 90.61 0.903
educ_years 52.73 1 0.0000 161.34 10 0.0000 141.34 0.856
hiv 61.26 1 0.0000 222.59 11 0.0000 200.59 0.801
diabetes_yn 72.61 1 0.0000 295.21 12 0.0000 271.21 0.736
age_group 66.46 4 0.0000 361.67 16 0.0000 329.67 0.676
smokhx 95.49 2 0.0000 457.16 18 0.0000 421.16 0.591
lab_hgb 250.25 1 0.0000 707.41 19 0.0000 669.41 0.367
drughx 410.34 2 0.0000 1117.75 21 0.0000 1075.75 0.000
Approximate Estimates after Deleting Factors
Coef S.E. Wald Z P
[1,] -1.671 0.03255 -51.34 0
Factors in Final Model
None
We see that the variables through age can predict the full model with an R2 of 0.986.
The code below confirms how well the boot model approximates the predicted values from the full model.
# OLS of the full model's predicted values on the bootstrap-selected variables,
# quantifying how well the reduced (boot) model approximates the full model.
# (Fixed `=` top-level assignment to idiomatic `<-`.)
approx_boot_formula <- as.formula(paste0("predvals_full_model ~ ", (paste(boot_vars, collapse = "+"))))
approx_boot_model <- ols(approx_boot_formula, data=mi_sum, x=TRUE, y=TRUE)
coef_approx_model <- coef(approx_boot_model)
# R2: variance in full-model predictions explained by the boot variables
approx_boot_model$stats["R2"]
R2
0.903
The mean absolute error between predicted values from the approximate model and predicted values from the full model is:
# Predicted values from the approximate (boot-variable) OLS model
predvals_approx_mod <- predict(approx_boot_model)
mi_sum$predvals_approx_mod <- predvals_approx_mod
# Absolute error between approximated model predicted values and full model predicted values
abs_err <- mean(abs(predvals_approx_mod - predvals_full_model))
abs_err
[1] 0.272
Performance
Brier score, calibration slope, and calibration intercept of the model:
# Apparent and optimism-corrected performance (200 bootstrap samples) for the approximate model
approx_perf <- ev_glm(method="original", lp="predvals_approx_mod", outcome="unsuccessful", data=mi_sum, samples=200, return="performance")
# Keep only the calibration metrics of interest, in both columns
metric_rows <- c("Brier", "Intercept", "Slope")
metric_cols <- c("apparent", "corrected")
perf_approx_model <- approx_perf[metric_rows, metric_cols]
rownames(perf_approx_model) <- c("Brier score", "Calibration slope", "Calibration intercept")
perf_approx_model
apparent corrected
Brier score 0.139 0.1384
Calibration slope -0.007 -0.0109
Calibration intercept 0.972 0.9682
CI for apparent c-statistic:
# C-statistic with confidence interval for the approximate model (project helper c_ci)
c_approx_model <- c_ci(predvals_approx_mod, outcome_vector)
c_approx_model
c_stat LB UB
[1,] 0.768 0.732 0.802
Calibration plot
# Calibration plot for the approximate model (project helper cal_plot_solo)
cal_plot_solo(mi_sum, predvals_approx_mod, outcome_vector)
ROC curve
# Apparent ROC curve for the approximate model (samples=1: no resampling)
plot(ev_glm(method="original", lp="predvals_approx_mod", outcome="unsuccessful", data=mi_sum, samples=1, return="roc"))
Model comparison
Coefficients
# Extract names of variables retained in selected model
selected_vars_int <- names(coef_selected_model[which(coef_selected_model != 0)])
# Strip factor-level suffixes so dummy names collapse back to variable names.
# BUG FIX: "+" is a regex quantifier, so the original pattern "55+" matched only
# "55" and left a dangling "+" in names like "age_group55+"; escape it as "55[+]".
selected_vars_int <- gsub("Former|Never|Current|25-35|35-45|45-55|55[+]", "", selected_vars_int)
selected_vars_int <- unique(selected_vars_int)
# Drop the intercept term before building the formula
selected_vars <- selected_vars_int[-1]
selected_form <- as.formula(paste0("unsuccessful ~", (paste(selected_vars, collapse = "+"))))
# Refit the selected variables with lrm (T/F replaced with TRUE for safety)
selected_model <- lrm(selected_form, data = mi_sum, x = TRUE, y = TRUE)
predvals_selected_model <- predict(selected_model)
c_selected_model <- c_ci(predvals_selected_model, outcome_vector)
selected_model_val <- validate(selected_model)
perf_selected_model <- save_val_results(selected_model_val)
# Extract coefficients from each modeling approach and compare
# NOTE(review): the rename() targets (e.g. coef_full_model, X1) depend on the
# column name data.frame() derives from the deparsed argument; do not refactor
# these calls into a shared helper without re-checking those names.
coef_full <- data.frame(coef_full_model) %>% rownames_to_column("variable") %>% rename(full_model = coef_full_model) %>%
mutate(variable = gsub("[(]|[)]|=", "", variable))
# Lasso coefficients arrive as a sparse matrix; as.array() makes them a plain column (X1).
# The "rcsage, 4" pattern removes leftovers of rcs(age, 4) after parens/equals are stripped.
# (It appears both in the alternation and as a second gsub here; the second call is redundant but harmless.)
coef_lasso_min <- data.frame(as.array(coef_lasso_min)) %>% rownames_to_column("variable") %>% rename(lasso_min = X1) %>%
mutate(variable = gsub("[(]|[)]|=|rcsage, 4", "", variable),
variable = gsub("rcsage, 4", "", variable))
coef_lasso_1se <- data.frame(as.array(coef_lasso_1se)) %>% rownames_to_column("variable") %>% rename(lasso_1se = X1) %>%
mutate(variable = gsub("[(]|[)]|=", "", variable),
variable = gsub("rcsage, 4", "", variable))
coef_selected <- data.frame(coef_selected_model) %>% rownames_to_column("variable") %>% rename(selected_model = coef_selected_model) %>%
mutate(variable = gsub("[(]|[)]|=", "", variable),
variable = gsub("rcsage, 4", "", variable))
coef_median <- data.frame(coef_boot_median) %>% rownames_to_column("variable") %>% rename(boot_median = coef_boot_median) %>%
mutate(variable = gsub("[(]|[)]|=", "", variable),
variable = gsub("rcsage, 4", "", variable))
coef_boot_model <- data.frame(coef_boot_vars) %>% rownames_to_column("variable") %>% rename(boot_model = coef_boot_vars) %>%
mutate(variable = gsub("[(]|[)]|=", "", variable))
coef_shrink <- data.frame(coef_shrink_model) %>% rownames_to_column("variable") %>% rename(coef_shrink = coef_shrink_model) %>%
mutate(variable = gsub("[(]|[)]|=", "", variable))
coef_approx <- data.frame(coef_approx_model) %>% rownames_to_column("variable") %>% rename(approx_model = coef_approx_model) %>%
mutate(variable = gsub("[(]|[)]|=", "", variable))
# Preserve the boot-median row ordering for the comparison table
var_order <- coef_median %>% pull(variable)
# Full outer-merge all coefficient sets on variable name; a variable absent from
# a model gets coefficient 0 (not selected / shrunk to zero).
coefficients_table <- Reduce(function(x,y) merge(x, y, by = "variable", all.x = TRUE, all.y = TRUE),
list(coef_full, coef_selected, coef_median, coef_boot_model, coef_shrink, coef_approx, coef_lasso_min, coef_lasso_1se)) %>%
mutate(across(where(is.numeric), ~ replace(.x, is.na(.x), 0))) %>%  # superseded mutate_if -> across
arrange(factor(variable, var_order))
coefficients_table
Performance measures
# -- C statistics ---
# Stack the c-statistic (with CI bounds) from every modeling approach, one row per model
c_stat_list <- as.data.frame(rbind(c_full_model, c_selected_model, c_boot_median, c_boot_model, c_shrink, c_approx_model, c_lasso_min, c_lasso_1se))
rownames(c_stat_list) <- c("full_model", "selected_model" , "boot_median", "boot_model", "shrink", "approx", "lambda_min", "lambda_1se")
# Format as "c (LB, UB)" strings for reporting
c_stats <- c_stat_list %>%
rownames_to_column("model") %>%
mutate(across(where(is.numeric), ~ round(.x, 2))) %>%  # superseded mutate_if -> across
mutate(c_ci = paste0(c_stat, " (", LB, ", ", UB, ")")) %>%  # paste(sep="") -> paste0
select(model, c_ci)
#--- Brier score, slope, and intercept ---
# One row per model, keeping only the optimism-corrected column of each performance table
performance_list <- as.data.frame(t(cbind(perf_full_model[,"corrected"], perf_selected_model[,"corrected"], perf_boot_median[,"corrected"], perf_boot_model[,"corrected"], perf_shrink[,"corrected"], perf_approx_model[,"corrected"], perf_lasso_min[,"corrected"], perf_lasso_1se[,"corrected"])))
rownames(performance_list) <- c("full_model", "selected_model", "boot_median", "boot_model", "shrink", "approx", "lambda_min", "lambda_1se")
# NOTE(review): as.data.frame() with default check.names would normally turn
# "Brier score" into "Brier.score", which the rename() below does not expect --
# confirm this runs as intended, or pass check.names = FALSE.
performance <- performance_list %>%
rownames_to_column("model") %>%
rename(brier_score = "Brier score",
cal_slope = "Calibration slope",
cal_int = "Calibration intercept")
#---- Combine data ---
# Join the formatted c-statistics onto the calibration metrics
models_performance <- performance %>%
full_join(c_stats, by="model")
models_performance
# Transpose
t_model_performance <- as.data.frame(t(models_performance)) %>% rownames_to_column("stat")
Main results
Risk table
Based on coefficients from the boot model, here is a summary table broken down into three predicted-risk groups (low: ≤10%, medium: 10–20%, high: >20%). Each row shows the score range, group size, and observed vs. predicted outcome probability for that risk group.
# Attach boot-model predicted values to the data for group summaries
mi_sum$predvals_boot <- predvals_boot
# Total sample size (idiomatic nrow() instead of dim(x)[1]; `<-` instead of `=`)
n_tot <- nrow(mi_sum)
risk_groups <- mi_sum %>%
mutate(risk_group = cut(predrisk_boot,
breaks = c(0, .10, 0.2, 100),
include.lowest = TRUE,  # guard: a predicted risk of exactly 0 would otherwise be NA
labels= c("Low risk", "Medium risk", "High risk"))) %>%
group_by(risk_group) %>%
summarize(
min_score = min(predvals_boot),
max_score = max(predvals_boot),
risk_cut_off = max(predvals_boot),  # NOTE(review): duplicates max_score -- confirm intended
n_group = n(),
prop_group = n_group/n_tot,
outcome = sum(unsuccessful),
obs_prob_outcome = mean(unsuccessful),
pred_prob_outcome = mean(predrisk_boot))
risk_groups
Nomogram
# ----- Nomogram setup -----
# Register the data distribution so rms can derive default variable ranges
dd <- datadist(mi_sum)
options(datadist = "dd")
# Show the final (boot) model coefficients before relabeling
boot_model$coefficients
Intercept lab_hgb hiv drughx=Former drughx=Current
0.6552 -0.1786 0.7063 0.4970 1.1886
diabetes_yn age_group=25-35 age_group=35-45 age_group=45-55 age_group=55+
0.6543 -0.4815 -0.7053 -1.0856 -0.4582
educ_years smokhx=Former smokhx=Current
-0.0576 0.6335 0.5550
# Relabel model terms for display. Labels are positional --
# NOTE(review): order must match the model's variable order (lab_hgb, hiv,
# drughx, diabetes_yn, age_group, educ_years, smokhx per the coefficient
# printout above) -- confirm before changing the model formula.
boot_model <- Newlabels(boot_model, c("Hemoglobin (g/dL)", "HIV", "Drug use", "Diabetes", "Age group", "Education (years)", "Tobacco use"))
nom <- nomogram(boot_model,
fun=plogis,
fun.at=c(.001,.01,.05,seq(.1,.9,by=.1),.95,.99,.999),
funlabel="Risk of unsuccessful outcome",
vnames="labels")
# Plot
plot(nom)
# Save plot
jpeg(here("images", "nomogram.jpeg"), width=700, height=500)
plot(nom)
dev.off()
quartz_off_screen
2
Risk plot
# ----- Risk plot: histogram of predicted values with observed outcome probability overlaid -----
pred_vals <- predvals_boot
graph_data <- mi_sum %>%
  # Winsorize extremes to [-4, 1.5]; pmin/pmax replaces the nested ifelse calls
  mutate(pred_vals = pmin(pmax(pred_vals, -4), 1.5),
         risk_group = cut(pred_vals,
                          breaks = 15)) %>%  # 15 equal-width bins of predicted values
  group_by(risk_group) %>%
  summarize(outcome_prob = mean(unsuccessful),
            count = n(),
            min_pred_val = min(pred_vals),
            max_pred_val = max(pred_vals),
            mean_pred_val = mean(pred_vals))
# Scale factor so the probability line can share the count axis (secondary axis)
scaleFactor <- max(graph_data$count) / max(graph_data$outcome_prob)
text_size <- 15
graph <- graph_data %>%
ggplot(aes(x=mean_pred_val, y=count)) +
  geom_bar(stat="identity") +
  # `size` for lines is deprecated in ggplot2 >= 3.4 (use `linewidth`);
  # kept here for compatibility -- confirm installed version before changing
  geom_line(aes(y=outcome_prob*scaleFactor), color="tomato1", size=1) +
  scale_y_continuous("Participant count", expand=c(0,0), sec.axis = sec_axis(~./scaleFactor, name = "Outcome probability (%)")) +
  labs(x="Predicted value") +
  theme(
    axis.line.y.right = element_line(color = "tomato1"),
    axis.ticks.y.right = element_line(color = "tomato1"),
    axis.title.y.right=element_text(color="tomato1", size=text_size),
    axis.title.y.left=element_text(size=text_size),
    axis.title.x = element_text(size=text_size),
    axis.text.y.right=element_text(color="tomato1", size=12),
    axis.text.y.left = element_text(size=12),
    axis.text.x = element_text(size=12),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"))
graph
# Save plot
jpeg(here("images", "risk_plot.jpeg"), width=400, height=400)
# BUG FIX: a bare `graph` is only auto-printed interactively/in knitr; under
# source() it would not render and the jpeg would be blank -- print explicitly.
print(graph)
dev.off()
quartz_off_screen
2
#---- Save all results to Excel -----
coefficients <- coefficients_table
performance <- t_model_performance  # NOTE: overwrites the earlier `performance` data frame
library(openxlsx)
# Sheet name -> data to write; one entry per worksheet, order preserved.
# Replaces five duplicated addWorksheet()/writeData() pairs with a single loop.
sheet_data <- list(
  "Model-Selection" = raw_table,
  "Coefficients"    = coefficients,
  "Final-Model"     = boot_coef_se,
  "Performance"     = performance,
  "Risk-Table"      = risk_groups
)
# Create a blank workbook and populate each sheet
results <- createWorkbook()
for (sheet_name in names(sheet_data)) {
  addWorksheet(results, sheet_name)
  writeData(results, sheet = sheet_name, x = sheet_data[[sheet_name]])
}
# Export the file
saveWorkbook(results, here("tables", "main_analyses_results.xlsx"), overwrite = TRUE)